knitr::opts_chunk$set(echo = TRUE)
library(reticulate)
# use_python("/Users/oldemarrodriguez/anaconda3/bin/python3.7") # PROMIDAT
use_python("/anaconda3/bin/python3.6") ## Portátil
import os
import graphviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
En Mac (Terminal):
pip install graphviz
conda install graphviz
En Windows (Anaconda Prompt):
pip install graphviz
conda install graphviz
Para mostrar un gráfico en un archivo rmd de un árbol
def graficar_arbol(grafico = None):
    """Render a graphviz tree to a PNG file and display it inline.

    Needed because graphviz objects do not display directly inside an
    Rmd/knitr chunk; the graph is rendered to disk and read back with
    matplotlib.

    Parameters
    ----------
    grafico : graphviz.Source
        Graph to render. Must not be None.

    Raises
    ------
    ValueError
        If no graph is supplied (the original code crashed with an
        AttributeError on ``None.format`` in that case).
    """
    if grafico is None:
        raise ValueError("Se debe suministrar un gráfico (graphviz.Source).")
    grafico.format = "png"
    archivo = grafico.render()   # writes <name>.png and returns its path
    img = mpimg.imread(archivo)
    plt.imshow(img)              # return value was unused; no need to keep it
    plt.axis('off')              # hide axes: only the tree image is wanted
    plt.show()
    plt.close()
Índices para matrices NxN
def indices_general(MC, nombres = None):
    """Compute global quality indices from an NxN confusion matrix.

    Parameters
    ----------
    MC : numpy.ndarray
        Square (NxN) confusion matrix; rows = actual, columns = predicted.
    nombres : list of str, optional
        Category names used as column labels of the per-category table.

    Returns
    -------
    dict
        The confusion matrix, global precision, global error and a one-row
        DataFrame with the per-category precision.
    """
    precision_global = np.sum(MC.diagonal()) / np.sum(MC)
    error_global = 1 - precision_global
    # Per-category value = diagonal over each row total (i.e. hits among
    # the actual members of each class).
    precision_categoria = pd.DataFrame(MC.diagonal() / np.sum(MC, axis = 1)).T
    # 'is not None' instead of '!= None': '!=' against a list/array can be
    # element-wise (numpy) and is never the idiomatic None check.
    if nombres is not None:
        precision_categoria.columns = nombres
    return {"Matriz de Confusión":MC,
            "Precisión Global":precision_global,
            "Error Global":error_global,
            "Precisión por Categoría":precision_categoria}
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('iris.csv',delimiter=';',decimal=".")
print(datos.shape)
## (150, 5)
print(datos.head())
## s.largo s.ancho p.largo p.ancho tipo
## 0 5.1 3.5 1.4 0.2 setosa
## 1 4.9 3.0 1.4 0.2 setosa
## 2 4.7 3.2 1.3 0.2 setosa
## 3 4.6 3.1 1.5 0.2 setosa
## 4 5.0 3.6 1.4 0.2 setosa
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 150 entries, 0 to 149
## Data columns (total 5 columns):
## s.largo 150 non-null float64
## s.ancho 150 non-null float64
## p.largo 150 non-null float64
## p.ancho 150 non-null float64
## tipo 150 non-null object
## dtypes: float64(4), object(1)
## memory usage: 5.9+ KB
## None
Elimina la variable categórica, deja las variables predictoras en X
X = datos.iloc[:,:4]
print(X.head())
## s.largo s.ancho p.largo p.ancho
## 0 5.1 3.5 1.4 0.2
## 1 4.9 3.0 1.4 0.2
## 2 4.7 3.2 1.3 0.2
## 3 4.6 3.1 1.5 0.2
## 4 5.0 3.6 1.4 0.2
Deja la variable a predecir en y
y = datos.iloc[:,4:5]
print(y.head())
## tipo
## 0 setosa
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
Con el 70% de los datos para entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
## FutureWarning)
print(X_train)
## s.largo s.ancho p.largo p.ancho
## 60 5.0 2.0 3.5 1.0
## 116 6.5 3.0 5.5 1.8
## 144 6.7 3.3 5.7 2.5
## 119 6.0 2.2 5.0 1.5
## 108 6.7 2.5 5.8 1.8
## 69 5.6 2.5 3.9 1.1
## 135 7.7 3.0 6.1 2.3
## 56 6.3 3.3 4.7 1.6
## 80 5.5 2.4 3.8 1.1
## 123 6.3 2.7 4.9 1.8
## 133 6.3 2.8 5.1 1.5
## 106 4.9 2.5 4.5 1.7
## 146 6.3 2.5 5.0 1.9
## 50 7.0 3.2 4.7 1.4
## 147 6.5 3.0 5.2 2.0
## 85 6.0 3.4 4.5 1.6
## 30 4.8 3.1 1.6 0.2
## 101 5.8 2.7 5.1 1.9
## 94 5.6 2.7 4.2 1.3
## 64 5.6 2.9 3.6 1.3
## 89 5.5 2.5 4.0 1.3
## 91 6.1 3.0 4.6 1.4
## 125 7.2 3.2 6.0 1.8
## 48 5.3 3.7 1.5 0.2
## 13 4.3 3.0 1.1 0.1
## 111 6.4 2.7 5.3 1.9
## 95 5.7 3.0 4.2 1.2
## 20 5.4 3.4 1.7 0.2
## 15 5.7 4.4 1.5 0.4
## 52 6.9 3.1 4.9 1.5
## .. ... ... ... ...
## 14 5.8 4.0 1.2 0.2
## 122 7.7 2.8 6.7 2.0
## 19 5.1 3.8 1.5 0.3
## 29 4.7 3.2 1.6 0.2
## 130 7.4 2.8 6.1 1.9
## 49 5.0 3.3 1.4 0.2
## 136 6.3 3.4 5.6 2.4
## 99 5.7 2.8 4.1 1.3
## 82 5.8 2.7 3.9 1.2
## 79 5.7 2.6 3.5 1.0
## 115 6.4 3.2 5.3 2.3
## 145 6.7 3.0 5.2 2.3
## 72 6.3 2.5 4.9 1.5
## 77 6.7 3.0 5.0 1.7
## 25 5.0 3.0 1.6 0.2
## 81 5.5 2.4 3.7 1.0
## 140 6.7 3.1 5.6 2.4
## 142 5.8 2.7 5.1 1.9
## 39 5.1 3.4 1.5 0.2
## 58 6.6 2.9 4.6 1.3
## 88 5.6 3.0 4.1 1.3
## 70 5.9 3.2 4.8 1.8
## 87 6.3 2.3 4.4 1.3
## 36 5.5 3.5 1.3 0.2
## 21 5.1 3.7 1.5 0.4
## 9 4.9 3.1 1.5 0.1
## 103 6.3 2.9 5.6 1.8
## 67 5.8 2.7 4.1 1.0
## 117 7.7 3.8 6.7 2.2
## 47 4.6 3.2 1.4 0.2
##
## [105 rows x 4 columns]
print(X_test)
## s.largo s.ancho p.largo p.ancho
## 114 5.8 2.8 5.1 2.4
## 62 6.0 2.2 4.0 1.0
## 33 5.5 4.2 1.4 0.2
## 107 7.3 2.9 6.3 1.8
## 7 5.0 3.4 1.5 0.2
## 100 6.3 3.3 6.0 2.5
## 40 5.0 3.5 1.3 0.3
## 86 6.7 3.1 4.7 1.5
## 76 6.8 2.8 4.8 1.4
## 71 6.1 2.8 4.0 1.3
## 134 6.1 2.6 5.6 1.4
## 51 6.4 3.2 4.5 1.5
## 73 6.1 2.8 4.7 1.2
## 54 6.5 2.8 4.6 1.5
## 63 6.1 2.9 4.7 1.4
## 37 4.9 3.1 1.5 0.1
## 78 6.0 2.9 4.5 1.5
## 90 5.5 2.6 4.4 1.2
## 45 4.8 3.0 1.4 0.3
## 16 5.4 3.9 1.3 0.4
## 121 5.6 2.8 4.9 2.0
## 66 5.6 3.0 4.5 1.5
## 24 4.8 3.4 1.9 0.2
## 8 4.4 2.9 1.4 0.2
## 126 6.2 2.8 4.8 1.8
## 22 4.6 3.6 1.0 0.2
## 44 5.1 3.8 1.9 0.4
## 97 6.2 2.9 4.3 1.3
## 93 5.0 2.3 3.3 1.0
## 26 5.0 3.4 1.6 0.4
## 137 6.4 3.1 5.5 1.8
## 84 5.4 3.0 4.5 1.5
## 27 5.2 3.5 1.5 0.2
## 127 6.1 3.0 4.9 1.8
## 132 6.4 2.8 5.6 2.2
## 59 5.2 2.7 3.9 1.4
## 18 5.7 3.8 1.7 0.3
## 83 6.0 2.7 5.1 1.6
## 61 5.9 3.0 4.2 1.5
## 92 5.8 2.6 4.0 1.2
## 112 6.8 3.0 5.5 2.1
## 2 4.7 3.2 1.3 0.2
## 141 6.9 3.1 5.1 2.3
## 43 5.0 3.5 1.6 0.6
## 10 5.4 3.7 1.5 0.2
print(y_train)
## tipo
## 60 versicolor
## 116 virginica
## 144 virginica
## 119 virginica
## 108 virginica
## 69 versicolor
## 135 virginica
## 56 versicolor
## 80 versicolor
## 123 virginica
## 133 virginica
## 106 virginica
## 146 virginica
## 50 versicolor
## 147 virginica
## 85 versicolor
## 30 setosa
## 101 virginica
## 94 versicolor
## 64 versicolor
## 89 versicolor
## 91 versicolor
## 125 virginica
## 48 setosa
## 13 setosa
## 111 virginica
## 95 versicolor
## 20 setosa
## 15 setosa
## 52 versicolor
## .. ...
## 14 setosa
## 122 virginica
## 19 setosa
## 29 setosa
## 130 virginica
## 49 setosa
## 136 virginica
## 99 versicolor
## 82 versicolor
## 79 versicolor
## 115 virginica
## 145 virginica
## 72 versicolor
## 77 versicolor
## 25 setosa
## 81 versicolor
## 140 virginica
## 142 virginica
## 39 setosa
## 58 versicolor
## 88 versicolor
## 70 versicolor
## 87 versicolor
## 36 setosa
## 21 setosa
## 9 setosa
## 103 virginica
## 67 versicolor
## 117 virginica
## 47 setosa
##
## [105 rows x 1 columns]
print(y_test)
## tipo
## 114 virginica
## 62 versicolor
## 33 setosa
## 107 virginica
## 7 setosa
## 100 virginica
## 40 setosa
## 86 versicolor
## 76 versicolor
## 71 versicolor
## 134 virginica
## 51 versicolor
## 73 versicolor
## 54 versicolor
## 63 versicolor
## 37 setosa
## 78 versicolor
## 90 versicolor
## 45 setosa
## 16 setosa
## 121 virginica
## 66 versicolor
## 24 setosa
## 8 setosa
## 126 virginica
## 22 setosa
## 44 setosa
## 97 versicolor
## 93 versicolor
## 26 setosa
## 137 virginica
## 84 versicolor
## 27 setosa
## 127 virginica
## 132 virginica
## 59 versicolor
## 18 setosa
## 83 versicolor
## 61 versicolor
## 92 versicolor
## 112 virginica
## 2 setosa
## 141 virginica
## 43 setosa
## 10 setosa
Mediante el constructor inicializa la instancia_arbol
instancia_arbol = DecisionTreeClassifier(random_state=0)
Entrena el modelo llamando al método fit
Nota: Esto se debe a que, al ser Python orientado a objetos, el modelo queda en un atributo de la instancia “instancia_arbol” al llamar el método fit de la clase DecisionTreeClassifier.
instancia_arbol.fit(X_train,y_train)
Imprime las predicciones en testing
print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['virginica' 'versicolor' 'setosa' 'virginica' 'setosa' 'virginica'
## 'setosa' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor'
## 'versicolor' 'versicolor' 'versicolor' 'setosa' 'versicolor' 'versicolor'
## 'setosa' 'setosa' 'virginica' 'versicolor' 'setosa' 'setosa' 'virginica'
## 'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
## 'versicolor' 'setosa' 'virginica' 'virginica' 'versicolor' 'setosa'
## 'virginica' 'versicolor' 'versicolor' 'virginica' 'setosa' 'virginica'
## 'setosa' 'setosa']
Porcentaje de predicción global
print("Precisión en Testing: {:.3f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.978
Matriz de confusión
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[16 0 0]
## [ 0 17 1]
## [ 0 0 11]]
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[16 0 0]
## [ 0 17 1]
## [ 0 0 11]]
##
## Precisión Global:
## 0.9777777777777777
##
## Error Global:
## 0.022222222222222254
##
## Precisión por Categoría:
## setosa versicolor virginica
## 0 1.0 0.944444 1.0
Graficando el árbol
# BUG FIX: the hard-coded class_names ["Setosa", "Virginica", "Versicolor"]
# did not match sklearn's class ordering. export_graphviz assigns labels
# positionally, and the classifier stores classes sorted lexicographically
# (setosa, versicolor, virginica), so the Versicolor/Virginica labels were
# swapped in the plotted tree. Using the fitted classes_ attribute keeps
# the labels correct for any class set.
dot_data = export_graphviz(instancia_arbol, out_file=None,
                           class_names=list(instancia_arbol.classes_),
                           feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('MuestraCredito5000.csv',delimiter=';',decimal=".")
print(datos.shape)
## (5000, 6)
print(datos.head())
## MontoCredito IngresoNeto ... GradoAcademico BuenPagador
## 0 1 1 ... 1 Si
## 1 3 1 ... 1 Si
## 2 2 1 ... 1 Si
## 3 1 2 ... 1 Si
## 4 1 1 ... 1 Si
##
## [5 rows x 6 columns]
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 5000 entries, 0 to 4999
## Data columns (total 6 columns):
## MontoCredito 5000 non-null int64
## IngresoNeto 5000 non-null int64
## CoefCreditoAvaluo 5000 non-null int64
## MontoCuota 5000 non-null int64
## GradoAcademico 5000 non-null int64
## BuenPagador 5000 non-null object
## dtypes: int64(5), object(1)
## memory usage: 234.5+ KB
## None
Nota: Está tomando erróneamente los datos como numéricos; en este caso se deben convertir las variables categóricas porque en realidad estos números son códigos. NO es que siempre se deban convertir las variables numéricas a categóricas.
datos['MontoCredito'] = datos['MontoCredito'].astype('category')
datos['IngresoNeto'] = datos['IngresoNeto'].astype('category')
datos['CoefCreditoAvaluo'] = datos['CoefCreditoAvaluo'].astype('category')
datos['MontoCuota'] = datos['MontoCuota'].astype('category')
datos['GradoAcademico'] = datos['GradoAcademico'].astype('category')
print(datos.head())
## MontoCredito IngresoNeto ... GradoAcademico BuenPagador
## 0 1 1 ... 1 Si
## 1 3 1 ... 1 Si
## 2 2 1 ... 1 Si
## 3 1 2 ... 1 Si
## 4 1 1 ... 1 Si
##
## [5 rows x 6 columns]
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 5000 entries, 0 to 4999
## Data columns (total 6 columns):
## MontoCredito 5000 non-null category
## IngresoNeto 5000 non-null category
## CoefCreditoAvaluo 5000 non-null category
## MontoCuota 5000 non-null category
## GradoAcademico 5000 non-null category
## BuenPagador 5000 non-null object
## dtypes: category(5), object(1)
## memory usage: 64.5+ KB
## None
Elimina la variable categórica, deja las variables predictoras en X
X = datos.iloc[:,:5]
print(X.head())
## MontoCredito IngresoNeto ... MontoCuota GradoAcademico
## 0 1 1 ... 1 1
## 1 3 1 ... 1 1
## 2 2 1 ... 1 1
## 3 1 2 ... 1 1
## 4 1 1 ... 1 1
##
## [5 rows x 5 columns]
Deja la variable a predecir en y
y = datos.iloc[:,5:6]
print(y.head())
## BuenPagador
## 0 Si
## 1 Si
## 2 Si
## 3 Si
## 4 Si
Con el 75% de los datos para entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
## FutureWarning)
Mediante el constructor inicializa la instancia
instancia_arbol = DecisionTreeClassifier(random_state=0)
instancia_arbol.fit(X_train,y_train)
Imprime las predicciones en testing
print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'Si' ... 'Si' 'Si' 'No']
Porcentaje de predicción global
print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.95
Matriz de confusión
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[ 119 47]
## [ 13 1071]]
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[ 119 47]
## [ 13 1071]]
##
## Precisión Global:
## 0.952
##
## Error Global:
## 0.04800000000000004
##
## Precisión por Categoría:
## No Si
## 0 0.716867 0.988007
Graficando el árbol
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)
Nota: El árbol es demasiado grande porque min_samples_leaf=1
Vamos a podar el árbol, puede ser que la calidad de la predicción sea menor
instancia_arbol = DecisionTreeClassifier(random_state=0,min_samples_leaf=150)
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
## max_features=None, max_leaf_nodes=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=150, min_samples_split=2,
## min_weight_fraction_leaf=0.0, presort=False, random_state=0,
## splitter='best')
Entrena el modelo llamando al método fit
Observe que no hay variable que guarde el modelo como en R, esto se debe a que al ser Python orientado a Objetos, el modelo queda en un atributo de la instancia “instancia_arbol” por defecto:
instancia_arbol.fit(X_train,y_train)
Imprime las predicciones en testing
print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'No' ... 'Si' 'Si' 'No']
Porcentaje de predicción global
print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.92
Matriz de confusión
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[ 90 76]
## [ 26 1058]]
Índices de calidad
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[ 90 76]
## [ 26 1058]]
##
## Precisión Global:
## 0.9184
##
## Error Global:
## 0.0816
##
## Precisión por Categoría:
## No Si
## 0 0.542169 0.976015
Graficando el árbol
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)
class sklearn.tree.DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)[source]
criterion: string, optional (default=”gini”) The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.
random_state: int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator.
max_depth: int or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
min_samples_split: int, float, optional (default=2) The minimum number of samples required to split an internal node:
min_samples_leaf: int, float, optional (default=1) The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
max_features: int, float, string or None, optional (default=None) The number of features to consider when looking for the best split:
If int, then consider max_features features at each split. If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split. If “auto”, then max_features=sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If “log2”, then max_features=log2(n_features). If None, then max_features=n_features. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than max_features features.
max_leaf_nodes: int or None, optional (default=None) Grow a tree with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.
min_impurity_decrease: float, optional (default=0.) A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
classes_: array of shape = [n_classes] or a list of such arrays The classes labels (single output problem), or a list of arrays of class labels (multi-output problem).
feature_importances_: array of shape = [n_features] Return the feature importances.
n_classes_: int or list The number of classes (for single output problems), or a list containing the number of classes for each output (for multi-output problems).
n_features_: int The number of features when fit is performed.
instancia_arbol = DecisionTreeClassifier(criterion="entropy",)
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
## max_features=None, max_leaf_nodes=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=1, min_samples_split=2,
## min_weight_fraction_leaf=0.0, presort=False, random_state=None,
## splitter='best')
Entrena el modelo llamando al método fit
Observe que no hay variable que guarde el modelo como en R, esto se debe a que al ser Python orientado a Objetos, el modelo queda en un atributo de la instancia “instancia_arbol” por defecto:
instancia_arbol.fit(X_train,y_train)
Imprime las predicciones en testing
print("Las predicciones en Testing son: {}".format(instancia_arbol.predict(X_test)))
## Las predicciones en Testing son: ['Si' 'Si' 'No' ... 'Si' 'Si' 'No']
Porcentaje de predicción global
print("Precisión en Testing: {:.2f}".format(instancia_arbol.score(X_test, y_test)))
## Precisión en Testing: 0.95
Matriz de confusión
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
print("Matriz de Confusión:\n{}".format(MC))
## Matriz de Confusión:
## [[ 120 46]
## [ 13 1071]]
Índices de calidad
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[ 120 46]
## [ 13 1071]]
##
## Precisión Global:
## 0.9528
##
## Error Global:
## 0.04720000000000002
##
## Precisión por Categoría:
## No Si
## 0 0.722892 0.988007
Graficando el árbol
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.shape)
## (462, 10)
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null object
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: float64(5), int64(3), object(2)
## memory usage: 36.2+ KB
## None
print(datos.head())
# Convierte las variables de object a categórica
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null category
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
# Recodifica las categorías usando números
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
datos["famhist"] = datos["famhist"].cat.codes
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null int8
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: float64(5), int64(3), int8(1), object(1)
## memory usage: 33.0+ KB
## None
print(datos.head())
# Convierte las variables de entero a categórica
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49 Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null category
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49 Si
X = datos.iloc[:,:9]
print(X.head())
## sbp tobacco ldl adiposity famhist typea obesity alcohol age
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49
y = datos.iloc[:,9:10]
print(y.head())
## chd
## 0 Si
## 1 Si
## 2 No
## 3 Si
## 4 Si
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
## FutureWarning)
instancia_arbol = DecisionTreeClassifier(criterion="gini")
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
## max_features=None, max_leaf_nodes=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=1, min_samples_split=2,
## min_weight_fraction_leaf=0.0, presort=False, random_state=None,
## splitter='best')
instancia_arbol.fit(X_train,y_train)
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[49 12]
## [11 21]]
##
## Precisión Global:
## 0.7526881720430108
##
## Error Global:
## 0.24731182795698925
##
## Precisión por Categoría:
## No Si
## 0 0.803279 0.65625
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
## /Users/oldemarrodriguez/Google Drive/MDCurso/Datos
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.shape)
## (462, 10)
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null object
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: float64(5), int64(3), object(2)
## memory usage: 36.2+ KB
## None
print(datos.head())
# Convierte las variables de object a categórica
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null category
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
# Recodifica las categorías usando números
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
datos["famhist"] = datos["famhist"].cat.codes
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null int8
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: float64(5), int64(3), int8(1), object(1)
## memory usage: 33.0+ KB
## None
print(datos.head())
# Convierte las variables de entero a categórica
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49 Si
datos['famhist'] = datos['famhist'].astype('category')
print(datos.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 462 entries, 0 to 461
## Data columns (total 10 columns):
## sbp 462 non-null int64
## tobacco 462 non-null float64
## ldl 462 non-null float64
## adiposity 462 non-null float64
## famhist 462 non-null category
## typea 462 non-null int64
## obesity 462 non-null float64
## alcohol 462 non-null float64
## age 462 non-null int64
## chd 462 non-null object
## dtypes: category(1), float64(5), int64(3), object(1)
## memory usage: 33.1+ KB
## None
print(datos.head())
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52 Si
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63 Si
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46 No
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58 Si
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49 Si
X = datos.iloc[:,:9]
print(X.head())
## sbp tobacco ldl adiposity famhist typea obesity alcohol age
## 0 160 12.00 5.73 23.11 1 49 25.30 97.20 52
## 1 144 0.01 4.41 28.61 0 55 28.87 2.06 63
## 2 118 0.08 3.48 32.28 1 52 29.14 3.81 46
## 3 170 7.50 6.41 38.03 1 51 31.99 24.26 58
## 4 134 13.60 3.50 27.78 1 60 25.99 57.34 49
y = datos.iloc[:,9:10]
print(y.head())
## chd
## 0 Si
## 1 Si
## 2 No
## 3 Si
## 4 Si
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=0)
## /anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
## FutureWarning)
instancia_arbol = DecisionTreeClassifier(max_depth=4,criterion="gini")
print(instancia_arbol)
## DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
## max_features=None, max_leaf_nodes=None,
## min_impurity_decrease=0.0, min_impurity_split=None,
## min_samples_leaf=1, min_samples_split=2,
## min_weight_fraction_leaf=0.0, presort=False, random_state=None,
## splitter='best')
instancia_arbol.fit(X_train,y_train)
prediccion = instancia_arbol.predict(X_test)
MC = confusion_matrix(y_test, prediccion)
indices = indices_general(MC,list(np.unique(y)))
for k in indices:
print("\n%s:\n%s"%(k,str(indices[k])))
##
## Matriz de Confusión:
## [[53 8]
## [25 7]]
##
## Precisión Global:
## 0.6451612903225806
##
## Error Global:
## 0.3548387096774194
##
## Precisión por Categoría:
## No Si
## 0 0.868852 0.21875
dot_data = export_graphviz(instancia_arbol, out_file=None,class_names=["No", "Si"],
feature_names=list(X.columns.values), filled=True)
grafico = graphviz.Source(dot_data)
graficar_arbol(grafico)